if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("curatedTCGAData");
Update all/some/none? [a/s/n]: 
n
BiocManager::install("TCGAutils");
Update all/some/none? [a/s/n]: 
n
BiocManager::install("TCGAbiolinks");
Update all/some/none? [a/s/n]: 
n

install.packages("SNFtool");
Error in install.packages : Updating loaded packages
install.packages("NetPreProc");
Error in install.packages : Updating loaded packages
library("curatedTCGAData");
library("TCGAbiolinks");
library("TCGAutils");

library("SNFtool")
library("NetPreProc");
library("cluster");

1 Download Prostate adenocarcinoma dataset


assays <- c("miRNASeqGene", "RNASeq2Gene", "RPPAArray");
mo <- curatedTCGAData(diseaseCode = "PRAD", 
                        assays = assays,
                        version = "2.1.1", dry.run=FALSE);
snapshotDate(): 2023-10-24
Working on: PRAD_RNASeq2Gene-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_RPPAArray-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_miRNASeqGene-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_colData-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_sampleMap-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_metadata-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
harmonizing input:
  removing 5189 sampleMap rows not in names(experiments)
mo
A MultiAssayExperiment object of 3 listed
 experiments with user-defined names and respective classes.
 Containing an ExperimentList class object of length 3:
 [1] PRAD_RNASeq2Gene-20160128: SummarizedExperiment with 20501 rows and 550 columns
 [2] PRAD_RPPAArray-20160128: SummarizedExperiment with 195 rows and 352 columns
 [3] PRAD_miRNASeqGene-20160128: SummarizedExperiment with 1046 rows and 547 columns
Functionality:
 experiments() - obtain the ExperimentList instance
 colData() - the primary/phenotype DataFrame
 sampleMap() - the sample coordination DataFrame
 `$`, `[`, `[[` - extract colData columns, subset, or experiment
 *Format() - convert into a long or wide DataFrame
 assays() - convert ExperimentList to a SimpleList of matrices
 exportClass() - save data to flat files

2 Data pre-processing

Consider only primary solid tumors Analyte information missing from PRAD_RPPAArray (20th position) that causes inconsistent barcode lengths in MultiArrayExperiment object mo (but doesn’t cause problems for the code below).

primary <- TCGAutils::TCGAsampleSelect(colnames(mo), c("01"));
mo <- mo[,primary,]

Check for replicates for same patient (first 12 characters) No replicates present in mo.

check_rep <- anyReplicated(mo);
print(check_rep)
 PRAD_RNASeq2Gene-20160128    PRAD_RPPAArray-20160128 PRAD_miRNASeqGene-20160128 
                     FALSE                      FALSE                      FALSE 

Remove FFPE samples (frozen tissue is better preseved)

no_ffpe <- which(as.data.frame(colData(mo))$patient.samples.sample.is_ffpe == "no");
as.data.frame(colData(mo))
mo <- mo[, no_ffpe, ];

Consider samples having all considered omics data

complete <- intersectColumns(mo);

ExpressionList → simple list of matrices

complete <- assays(complete)
print(dim(complete[[1]]))
[1] 20501   348

Transpose all three matrices, obtaining samples \(\times\) features

complete <- lapply(complete,FUN=t)

Remove features having missing values (no missing values in the current dataset)

for (i in 1:length(complete)){
  #print(dim(complete[[i]]))
  complete[[i]] <- complete[[i]][,colSums(is.na(complete[[i]]))==0]
  #print(dim(complete[[i]]))
}

Select features having more variance across samples because they bring more information.

nf <- 100;
for(i in 1:length(complete)){
    
    idx <- caret::nearZeroVar(complete[[i]])
    message(paste("Removed ", length(idx), "features from", names(complete)[i]));
    if(length(idx) != 0){
        complete[[i]] <- complete[[i]][, -idx];
    }

    if(ncol(complete[[i]]) <= nf) next
    
    vars <- apply(complete[[i]], 2, var);
    idx <- sort(vars, index.return=TRUE, decreasing = TRUE)$ix;
    
    complete[[i]] <- complete[[i]][, idx[1:nf]];
    
}
Removed  1334 features from PRAD_RNASeq2Gene-20160128
Removed  0 features from PRAD_RPPAArray-20160128
Removed  471 features from PRAD_miRNASeqGene-20160128

Standardize features with z-score

zscore <- function(data){
    
    zscore_vec <- function(x) { return ((x - mean(x)) / sd(x))}
    data <- apply(data, 2, zscore_vec)
    
    
    return(data)
}

complete <- lapply(complete, zscore);

Clean barcodes retaining only “Project-TSS-Participant” (first 12 characters)

for(v in 1:length(complete)){
    rownames(complete[[v]]) <- substr(rownames(complete[[v]]), 1, 12);
}

3 Download disease subtypes

subtypes <- as.data.frame(TCGAbiolinks::PanCancerAtlas_subtypes())

subtypes <- subtypes[subtypes$cancer.type == "PRAD", ];
# Retain only primary solid tumors and select samples in common with omics data
# (in the same order):
subtypes <- subtypes[TCGAutils::TCGAsampleSelect(subtypes$pan.samplesID, "01"), ];
sub_select <- substr(subtypes$pan.samplesID,1,12) %in% rownames(complete[[1]]);
subtypes <- subtypes[sub_select, ];

We consider only samples in the omics data in complete for which also subtype data is present in subtypes

rownames(subtypes) <- substr(subtypes$pan.samplesID, 1, 12);
for (i in 1:length(complete))
  complete[[i]] <- complete[[i]][rownames(subtypes),]
# Print number of samples for each subtype:
table(subtypes$Subtype_Integrative);

  1   2   3 
 60  83 105 

3.1 Check subtype & omics data order

count <- 0
for(i in 1:length(rownames(subtypes))){
  if(rownames(subtypes)[i] != rownames(complete[[1]])[i])
     count <- count + 1
}
count
[1] 0

All patients/samples are in the same order in both the datasets. # Multi-omics data integration Similarity matrices for each data source using exponential euclidian distance (affinity matrix).

3.2 Integration through SNF

Integration of the matrices using Similarity Network Fusion t number of iterations _K_ number of neighbours to consider to compute local similarity matrix

3.3 Integration through average

Integration through average of matrices

for (i in 1:length(W_list))
    plot(W_list[[i]],xlim=c(0,0.01),ylim=c(0,0.01))

4 Disease subtype discovery with PAM

Number of clusters.

4.1 Each data source

Convert similarity matrix into a distance matrix. The similarities are normalized in the range &$ using min-max normalization before conversion into distances.

# Convert similarity matrix into a distance matrix. The similarities
# are normalized in the range [0, 1] using min-max normalization before
# conversion into distances.
dist <- list()
D <- list()
for (i in 1:length((W_list))){
  dist[[i]] <- 1 - NetPreProc::Max.Min.norm(W_list[[i]])
  D[[i]] <- as.dist(dist[[i]])
}
for (i in 1:length((W_list)))
  pam.res <- pam(D[[i]], k=k)

4.2 Integrated data through mean

dist_mean <- 1 - NetPreProc::Max.Min.norm(W_int_mean);
D_mean <- as.dist(dist_mean); 

pam.res <- pam(D_mean, k=k);

4.3 Integrated data through SNF

dist_SNF <- 1 - NetPreProc::Max.Min.norm(W_int_SNF);
D_SNF <- as.dist(dist_SNF); 

# Apply clustering algorithms on integrated matrix:
pam.res <- pam(D_SNF, k=k);

#Disease subtype discovery with spectral clustering (Consider distance matrix calculated above) ## Integrated data through SNF

k <- length(unique(subtypes$Subtype_Integrative));
sc.res <- SNFtool::spectralClustering(W_int_SNF, K=k)
#to have uniform type
pam.sc.res <- pam(sc.res,k=k)
sessionInfo()
---
title: "Bioinformatics project year 2023/2024"
author: "Lucia Anna Mellini"
date:
#bibliography: references.bib
output: 
    html_notebook:
        toc: true
        number_sections: true
        toc_float: true
        theme: cerulean
        fig_caption: true
---


```{r message=FALSE, warning=FALSE}
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("curatedTCGAData");
BiocManager::install("TCGAutils");
BiocManager::install("TCGAbiolinks");

install.packages("SNFtool");
install.packages("NetPreProc");
```

```{r message=FALSE, warning=FALSE}
library("curatedTCGAData");
library("TCGAbiolinks");
library("TCGAutils");

library("SNFtool")
library("NetPreProc");
library("cluster");
```
# Download Prostate adenocarcinoma dataset#
<!-- miRNASeqGene              Gene-level log2 RPM miRNA expression values
<>RNASeq2Gene               RSEM TPM gene expression values
<>RPPAArray                 Reverse Phase Protein Array normalized protein expression values -->
```{r}

assays <- c("miRNASeqGene", "RNASeq2Gene", "RPPAArray");
mo <- curatedTCGAData(diseaseCode = "PRAD", 
                        assays = assays,
                        version = "2.1.1", dry.run=FALSE);
```

```{r message=TRUE}
mo
```
# Data pre-processing
Consider only primary solid tumors
Analyte information missing from PRAD_RPPAArray (20th position) that causes inconsistent barcode lengths in MultiArrayExperiment object  ```mo``` (but doesn't cause problems for the code below).
```{r message=FALSE, warning=FALSE}
primary <- TCGAutils::TCGAsampleSelect(colnames(mo), c("01"));
mo <- mo[,primary,]
```
Check for replicates for same patient (first 12 characters)
No replicates present in ```mo```.
```{r}
check_rep <- anyReplicated(mo);
print(check_rep)
```
Remove FFPE samples (frozen tissue is better preseved)
```{r}
no_ffpe <- which(as.data.frame(colData(mo))$patient.samples.sample.is_ffpe == "no");
as.data.frame(colData(mo))
mo <- mo[, no_ffpe, ];
```
Consider samples having all considered omics data
```{r}
complete <- intersectColumns(mo);
```
ExpressionList → simple list of matrices
```{r}
complete <- assays(complete)
print(dim(complete[[1]]))
```
Transpose all three matrices, obtaining samples $\times$ features
```{r}
complete <- lapply(complete,FUN=t)
```
Remove features having missing values (no missing values in the current dataset)
```{r}
for (i in 1:length(complete)){
  #print(dim(complete[[i]]))
  complete[[i]] <- complete[[i]][,colSums(is.na(complete[[i]]))==0]
  #print(dim(complete[[i]]))
}
```
Select features having more variance across samples because they bring more information.
```{r}
nf <- 100;
for(i in 1:length(complete)){
    
    idx <- caret::nearZeroVar(complete[[i]])
    message(paste("Removed ", length(idx), "features from", names(complete)[i]));
    if(length(idx) != 0){
        complete[[i]] <- complete[[i]][, -idx];
    }

    if(ncol(complete[[i]]) <= nf) next
    
    vars <- apply(complete[[i]], 2, var);
    idx <- sort(vars, index.return=TRUE, decreasing = TRUE)$ix;
    
    complete[[i]] <- complete[[i]][, idx[1:nf]];
    
}
```

Standardize features with z-score
```{r}
zscore <- function(data){
    
    zscore_vec <- function(x) { return ((x - mean(x)) / sd(x))}
    data <- apply(data, 2, zscore_vec)
    
    
    return(data)
}

complete <- lapply(complete, zscore);
```
Clean barcodes retaining only "Project-TSS-Participant" (first 12 characters)
```{r}
for(v in 1:length(complete)){
    rownames(complete[[v]]) <- substr(rownames(complete[[v]]), 1, 12);
}
```
# Download disease subtypes
```{r}
subtypes <- as.data.frame(TCGAbiolinks::PanCancerAtlas_subtypes())

subtypes <- subtypes[subtypes$cancer.type == "PRAD", ];
# Retain only primary solid tumors and select samples in common with omics data
# (in the same order):
subtypes <- subtypes[TCGAutils::TCGAsampleSelect(subtypes$pan.samplesID, "01"), ];
sub_select <- substr(subtypes$pan.samplesID,1,12) %in% rownames(complete[[1]]);
subtypes <- subtypes[sub_select, ];
```
We consider only samples in the omics data in ```complete``` for which also subtype data is present in ```subtypes```
```{r}
rownames(subtypes) <- substr(subtypes$pan.samplesID, 1, 12);
for (i in 1:length(complete))
  complete[[i]] <- complete[[i]][rownames(subtypes),]
# Print number of samples for each subtype:
table(subtypes$Subtype_Integrative);
```
## Check subtype & omics data order
```{r}
count <- 0
for(i in 1:length(rownames(subtypes))){
  if(rownames(subtypes)[i] != rownames(complete[[1]])[i])
     count <- count + 1
}
count
```
All patients/samples are in the same order in both the datasets.
# Multi-omics data integration
Similarity matrices for each data source using exponential euclidian distance (affinity matrix).
```{r}
W_list <- list();
for(i in 1:length(complete)){
    Dist <- (dist2(as.matrix(complete[[i]]), as.matrix(complete[[i]])))^(1/2)
    W_list[[i]] <- affinityMatrix(Dist)
}
```
## Integration through SNF
Integration of the matrices using Similarity Network Fusion
*_t_ number of iterations
*_K_ number of neighbours to consider to compute local similarity matrix

```{r}
W_int_SNF <- SNF(W_list, K=20, t=20);
```
## Integration through average
Integration through average of matrices
```{r}
W_int_mean <- Reduce('+', W_list)/length(W_list)
```
```{r}
for (i in 1:length(W_list))
    plot(W_list[[i]],xlim=c(0,0.01),ylim=c(0,0.01))
```

```{r}
plot(W_int_mean,xlim=c(0,0.01),ylim=c(0,0.01))
```
```{r}
plot(W_int_SNF,xlim=c(0,0.01),ylim=c(0,0.01))
```
# Disease subtype discovery with PAM
Number of clusters.
```{r}
k <- length(unique(subtypes$Subtype_Integrative));
```
## Each data source
Convert similarity matrix into a distance matrix. The similarities are normalized in the range &\left[0, 1\right]$ using min-max normalization before conversion into distances.
```{r}
dist <- list()
D <- list()
for (i in 1:length((W_list))){
  dist[[i]] <- 1 - NetPreProc::Max.Min.norm(W_list[[i]])
  D[[i]] <- as.dist(dist[[i]])
}
for (i in 1:length((W_list)))
  pam.res <- pam(D[[i]], k=k)
```
## Integrated data through mean
```{r}
dist_mean <- 1 - NetPreProc::Max.Min.norm(W_int_mean);
D_mean <- as.dist(dist_mean); 

pam.res <- pam(D_mean, k=k);
```
## Integrated data through SNF
```{r}
dist_SNF <- 1 - NetPreProc::Max.Min.norm(W_int_SNF);
D_SNF <- as.dist(dist_SNF); 

# Apply clustering algorithms on integrated matrix:
pam.res <- pam(D_SNF, k=k);
```
#Disease subtype discovery with spectral clustering
(Consider distance matrix calculated above)
## Integrated data through SNF
```{r}
k <- length(unique(subtypes$Subtype_Integrative));
sc.res <- SNFtool::spectralClustering(W_int_SNF, K=k)
#to have uniform type
pam.sc.res <- pam(sc.res,k=k)
```


```{r}
sessionInfo()
```

